
import dask.dataframe as dd
import pandas as pd
import numpy as np
from dask.diagnostics import ProgressBar

# Show a progress bar for every dask computation triggered below.

progress = ProgressBar()
progress.register()

# Full 2014 data, read lazily with dask. The 'radiccnpj' and 'numectps'
# columns are forced to 'object' instead of letting dask infer their dtype.

rais_dtypes = {'radiccnpj': 'object', 'numectps': 'object'}

rais_2014 = dd.read_csv('rais/rais_2014.csv', dtype=rais_dtypes)

# Matched apoios, loaded eagerly with pandas.

apoios = pd.read_csv('rais/rais19_apoios.csv')

# Lazy subset of the 2014 rows whose CPF appears in the apoios file.

in_apoios = rais_2014['CPF'].isin(apoios['CPF'])

rais_apoios = rais_2014[in_apoios]

# Columns the two sides must agree on exactly, followed by the two columns
# carried along for the later steps (CPF and idade).

match_keys = ['municipio', 'ocup2002', 'sexotrabalhador', 'grinstrucao']

oncols = match_keys + ['CPF', 'idade']

# Left-join every apoios row to all 2014 rows sharing the same key values.
# CPF and idade exist on both sides, so they come back suffixed with _x
# (apoios side) and _y (2014 side). This merge takes around 9 mins.
# NOTE(review): the right-hand side is the full 2014 table, which still
# contains the apoios rows; only same-CPF pairs are removed afterwards, so
# other apoios CPFs can appear as matches - confirm this is intended.

apoios_side = rais_apoios[oncols]
candidate_side = rais_2014[oncols]

rais_match = dd.merge(apoios_side, candidate_side,
                      on=match_keys, how='left').compute()

# One-to-many matching is problematic when using propensity scores; when the matching is exact, there is no reason for bias to increase under the assumptions

# single_matches = rais_match['CPF_x'].value_counts().reset_index()

# 1526 have just a single match, i.e. only themselves

# sum(single_matches['CPF_x'] == 1)

# Still to be explored: which ones got lost due to this

# Drop the pairs in which a CPF was matched with itself.

is_other_person = rais_match['CPF_x'] != rais_match['CPF_y']

# Distance matching on age: add the absolute age gap of each pair, order the
# pairs from the smallest to the largest gap, then keep only the first row
# seen for each CPF_y, so every right-hand CPF survives in a single pair
# (the one with its smallest age gap).

rais_match = (
    rais_match[is_other_person]
    .assign(idade_dif=lambda pairs: abs(pairs['idade_x'] - pairs['idade_y']))
    .sort_values(by='idade_dif')
    .drop_duplicates(subset='CPF_y')
)

# Keep, for every CPF_x, at most max_matches of its remaining pairs. The
# pairs are already ordered by idade_dif at this point, so the first rows of
# each group are the ones with the smallest age gap.
#
# A single groupby pass replaces the per-CPF loop, which re-scanned the whole
# frame once for every unique CPF_x, and it also copes with an empty frame
# (pd.concat on an empty list raises ValueError).
#
# NOTE(review): the previous loop tested for ">= 5" rows but sliced 5 + 1, so
# groups with six or more pairs kept six of them. That cap is preserved here
# so results do not change - confirm whether five was the intended number.

max_matches = 5 + 1

closest = rais_match.groupby('CPF_x', sort=False).head(max_matches)

# head() leaves the rows in their idade_dif order across all groups. A stable
# sort on the first-appearance group number puts the rows of each CPF_x
# together again, in the same row order the per-CPF loop used to produce.

group_order = closest.groupby('CPF_x', sort=False).ngroup().to_numpy()

rais_dist = closest.iloc[group_order.argsort(kind='stable')]

# Collect every CPF present in the selected pairs - B (CPF_x) and matched
# not B (CPF_y) - and pull all of their 2014 observations, to see why there
# were repeated cases.

bcpf, notbcpf = (rais_dist[side].unique() for side in ('CPF_x', 'CPF_y'))

getcpfs = np.concatenate((bcpf, notbcpf))

# This lookup over the full 2014 data takes about 4 mins.

is_wanted = rais_2014['CPF'].isin(getcpfs)

rais_get = rais_2014[is_wanted].compute()

# During a year there seem to be different situations that create a new RAIS entry: worker is moved to a different firm? branch? (Transferencia/movimentacao do empregado/servidor, sem onus para a cedente), a worker is laid off (Rescisao sem justa causa por iniciativa do empregador) or a contract ends (Termino do contrato de trabalho)

# If the entry is not Nao desligado then empem3112 is Nao (the worker is not employed at year's end, but more likely, that entry has no information on end of year employment)

# So if there isn't a Nao Desligado / empem3112 Sim for a worker in any given year the worker was not in a registered employment relationship

# Since a worker can have several different situations in a single year, I might need to spread those columns: during the year they could be transferred, fired, hired under a contract that finished, hired back or unemployed

# In the matched data there are 136485 entries but 88863 CPFs, so 47k extra entries

# In some cases an entry is repeated with all the same information and only tpvinculo differs, while saying almost the same thing (ESTAT N/EFET vs. ESTATUTARIO)

# Double entries for some workers with quite different salaries, and different occupational codes

# The safest option is to go with the end of year salary since it represents an exact point in time, but it would be interesting to look at those who lost employment in a given year and won't have an entry at year's end, and also at within-year trajectories

# tempempr can help to determine in these situations which one is the newest employment relationship, especially if hours don't add up to part time (case of CPF 81950837904)

# Seems these weird situations are more common in public employment, so I can just drop all those

# This article explains meaning of estatutario: https://www.concursosnobrasil.com.br/artigos/quais-sao-as-diferencas-entre-estatutario-e-clt.html

# Keep only the private sector: drop every row whose tpvinculo contains
# 'EST', which filters out public employment, except SOEs.
#
# The mask is negated with `~`, the boolean negation operator; unary minus
# only works on a mask through a pandas special case for clean bool dtype.
# na=False makes a missing tpvinculo count as "does not contain 'EST'", so
# the mask stays purely boolean and can be used for indexing.
# NOTE(review): as a consequence rows with a missing tpvinculo are kept -
# confirm that is the desired treatment.

has_est = rais_get['tpvinculo'].str.contains('EST', na=False)

rais_clt = rais_get[~has_est]

rais_clt.to_csv('data/rais_matched_test_clt.csv', index=False)

# It seems that transferencias might be promotions of some sort, see CPF 1000548465, the person got a transferencia, firm is the same but workplace size changed, also average salary is higher
